Load the necessary libraries
library(mgcv) #for GAMs
library(gratia) #for GAM plots
library(emmeans) #for marginal means etc
library(broom) #for tidy output
library(MuMIn) #for model selection and AICc
library(lubridate) #for processing dates
library(mapdata)
library(maps)
library(tidyverse) #for data wrangling
library(DHARMa) #for residual diagnostics
library(performance)
library(see)
library(sf)
library(stars)
library(rnaturalearth)
library(rnaturalearthdata)
library(raster)
library(ggspatial)
library(patchwork)
theme_set(theme_classic())
Paruelo and Lauenroth (1996) analyzed the geographic distribution and the effects of climate variables on the relative abundance of a number of plant functional types (PFT’s) including shrubs, forbs, succulents (e.g. cacti), C3 grasses and C4 grasses. They used data from 73 sites across temperate central North America (see pareulo.csv) and calculated the relative abundance of C3 grasses at each site as a response variable
grass
Format of paruelo.csv data file
| c3 | lat | long | map | mat | jjamap | djfmap |
|---|---|---|---|---|---|---|
| ... | ... | ... | ... | ... | ... | ... |
| c3 | - Relative abundance of c3 grasses at each site - response variable |
| lat | - Latitudinal coordinate |
| long | - Longitudinal coordinate |
| map | - Mean annual precipitation |
| mat | - Mean annual temperature |
| jjamap | - Mean annual precipitation in June, July, August |
| djfmap | - Mean annual precipitation in December, January, February |
paruelo <- read_csv('../data/paruelo.csv', trim_ws=TRUE) %>%
janitor::clean_names()
glimpse(paruelo)
## Rows: 73
## Columns: 7
## $ c3 <dbl> 0.65, 0.65, 0.76, 0.75, 0.33, 0.03, 0.00, 0.02, 0.05, 0.05, 0.…
## $ lat <dbl> 46.40, 47.32, 45.78, 43.95, 46.90, 38.87, 32.62, 36.95, 35.30,…
## $ long <dbl> 119.55, 114.27, 110.78, 101.87, 102.82, 99.38, 106.75, 96.55, …
## $ map <dbl> 199, 469, 536, 476, 484, 623, 259, 969, 542, 421, 446, 376, 66…
## $ mat <dbl> 12.4, 7.5, 7.2, 8.2, 4.8, 12.0, 14.5, 15.3, 13.9, 8.5, 5.1, 11…
## $ jjamap <dbl> 0.12, 0.24, 0.24, 0.35, 0.40, 0.40, 0.47, 0.30, 0.44, 0.31, 0.…
## $ djfmap <dbl> 0.45, 0.29, 0.20, 0.15, 0.14, 0.11, 0.17, 0.14, 0.13, 0.14, 0.…
canada <- ne_countries(country = c("united states of america", "canada"), scale = "medium", returnclass = "sf") %>%
st_crop(xmin = -130, xmax = -60,
ymin = 0, ymax = 80)
glimpse(canada)
## Rows: 2
## Columns: 64
## $ scalerank <int> 1, 1
## $ featurecla <chr> "Admin-0 country", "Admin-0 country"
## $ labelrank <dbl> 2, 2
## $ sovereignt <chr> "Canada", "United States of America"
## $ sov_a3 <chr> "CAN", "US1"
## $ adm0_dif <dbl> 0, 1
## $ level <dbl> 2, 2
## $ type <chr> "Sovereign country", "Country"
## $ admin <chr> "Canada", "United States of America"
## $ adm0_a3 <chr> "CAN", "USA"
## $ geou_dif <dbl> 0, 0
## $ geounit <chr> "Canada", "United States of America"
## $ gu_a3 <chr> "CAN", "USA"
## $ su_dif <dbl> 0, 0
## $ subunit <chr> "Canada", "United States of America"
## $ su_a3 <chr> "CAN", "USA"
## $ brk_diff <dbl> 0, 0
## $ name <chr> "Canada", "United States"
## $ name_long <chr> "Canada", "United States"
## $ brk_a3 <chr> "CAN", "USA"
## $ brk_name <chr> "Canada", "United States"
## $ brk_group <chr> NA, NA
## $ abbrev <chr> "Can.", "U.S.A."
## $ postal <chr> "CA", "US"
## $ formal_en <chr> "Canada", "United States of America"
## $ formal_fr <chr> NA, NA
## $ note_adm0 <chr> NA, NA
## $ note_brk <chr> NA, NA
## $ name_sort <chr> "Canada", "United States of America"
## $ name_alt <chr> NA, NA
## $ mapcolor7 <dbl> 6, 4
## $ mapcolor8 <dbl> 6, 5
## $ mapcolor9 <dbl> 2, 1
## $ mapcolor13 <dbl> 2, 1
## $ pop_est <dbl> 33487208, 313973000
## $ gdp_md_est <dbl> 1300000, 15094000
## $ pop_year <dbl> NA, 0
## $ lastcensus <dbl> 2011, 2010
## $ gdp_year <dbl> NA, 0
## $ economy <chr> "1. Developed region: G7", "1. Developed region: G7"
## $ income_grp <chr> "1. High income: OECD", "1. High income: OECD"
## $ wikipedia <dbl> NA, 0
## $ fips_10 <chr> NA, NA
## $ iso_a2 <chr> "CA", "US"
## $ iso_a3 <chr> "CAN", "USA"
## $ iso_n3 <chr> "124", "840"
## $ un_a3 <chr> "124", "840"
## $ wb_a2 <chr> "CA", "US"
## $ wb_a3 <chr> "CAN", "USA"
## $ woe_id <dbl> NA, NA
## $ adm0_a3_is <chr> "CAN", "USA"
## $ adm0_a3_us <chr> "CAN", "USA"
## $ adm0_a3_un <dbl> NA, NA
## $ adm0_a3_wb <dbl> NA, NA
## $ continent <chr> "North America", "North America"
## $ region_un <chr> "Americas", "Americas"
## $ subregion <chr> "Northern America", "Northern America"
## $ region_wb <chr> "North America", "North America"
## $ name_len <dbl> 6, 13
## $ long_len <dbl> 6, 13
## $ abbrev_len <dbl> 4, 6
## $ tiny <dbl> NA, NA
## $ homepart <dbl> 1, 1
## $ geometry <MULTIPOLYGON [°]> MULTIPOLYGON (((-60 43.9057..., MULTIPOLYGON …
spain <- ne_countries(country = c("spain", "italy"), scale = "medium", returnclass = "sf")
ggplot() +
geom_sf(data = canada)
The map is with a spatial features mapping, so it knows what the shape and coordinate reference system is from the sf object.
paruelo %>% head()
Note that the C3 data is % cover. To do percent cover, we need to ensure we do not have 0 or 100% cover ever. We also need to ensure that, when projecting onto a map, we have the same coordinate system.
paruelo <- paruelo %>%
mutate(c3nz = ifelse(c3 == 0, 0.01, c3),
long = -long)
paruelo_sf <- paruelo %>%
st_as_sf(coords = c("long", "lat"), crs = st_crs(canada))
ggplot() +
geom_sf(data = canada, aes(fill = adm0_a3_is), color = "white") +
scale_fill_manual(values = c("red", "light blue"), guide = "none") +
geom_sf(data = paruelo_sf, aes(col = c3, size = c3), alpha = 0.9) +
scale_color_viridis_c() +
coord_sf(expand = FALSE)
ggplot(paruelo) +
geom_histogram(aes(x = c3))
We will focus on the spatial components.
Model formula: \[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ \mu_i =\beta_0 + f(Long_i) + f(Lat_i) + f(Long_i, Lat_i) \]
where \(\beta_0\) is the y-intercept. \(f(Lat)\) and \(f(Long)\) indicate the additive smoothing functions of the spatial predictors.
s smooth using lat and long (must be in same scale)paruelo_gam1 <- gam(c3nz ~ s(long, lat), data = paruelo,
family = betar, method = "REML")
Note that we can combine the two smoother terms together because the two are on the same scale. This is only possible because the projection is to a flat, Mercator projection surface.
k.check(paruelo_gam1)
## k' edf k-index p-value
## s(long,lat) 29 4.848145 0.898749 0.1025
Looks like it is ok for not being overconstrained.
simulateResiduals(paruelo_gam1, plot = T)
We are unable to get the residuals because of the model family, but DHARMa has its own function for creating DHARMa-related functions from the model we select!
sim_paruelo_resids <- createDHARMa(
simulatedResponse = simulate(paruelo_gam1, nsim = 250),
observedResponse = paruelo$c3nz,
fittedPredictedResponse = predict(paruelo_gam1))
plot(sim_paruelo_resids)
Looks great!
te tensor products for two with different scalesAnother way of modelling is using tensor product smooths: bs = "te". These are especially useful when the two terms are on different scales.
paruelo_gam2 <- gam(c3nz ~ te(long, lat), data = paruelo,
family = betar, method = "REML")
createDHARMa(
simulatedResponse = simulate(paruelo_gam2, nsim = 250),
observedResponse = paruelo$c3nz,
fittedPredictedResponse = predict(paruelo_gam2)) %>% plot
k.check(paruelo_gam2)
## k' edf k-index p-value
## te(long,lat) 24 3.00004 0.7814077 0.015
All looks good! Note that edf is way down now!
However, the output of both of these models is a 2D smoother. If we want to split them both up, we use a tensor product interaction: bs = "ti", to see the differences for each component.
ti for tensor interactionsparuelo_gam3 <- gam(c3nz ~ ti(long) + ti(lat) + ti(long, lat), data = paruelo,
family = betar, method = "REML")
createDHARMa(
simulatedResponse = simulate(paruelo_gam3, nsim = 250),
observedResponse = paruelo$c3nz,
fittedPredictedResponse = predict(paruelo_gam3)) %>% plot
k.check(paruelo_gam3)
## k' edf k-index p-value
## ti(long) 4 1.000019 0.8673167 0.1250
## ti(lat) 4 1.000024 0.9588562 0.3400
## ti(long,lat) 16 3.523281 0.9956490 0.3775
All looks good!
gratia::draw()draw(paruelo_gam1) # thin plate spline
draw(paruelo_gam2) # tensor product
draw(paruelo_gam3) # tensor product interaction
In the third plot, it shows the effect of long when using the average lat, the effect of lat when using the average long, and the effect of the two combined together!
mgcv::vis.gam()par(mfrow = c(2,3))
vis.gam(paruelo_gam1, theta=30)
vis.gam(paruelo_gam2, theta=30)
vis.gam(paruelo_gam3, theta=30)
vis.gam(paruelo_gam1, theta=-30)
vis.gam(paruelo_gam2, theta=-30)
vis.gam(paruelo_gam3, theta=-30)
par(mfrow = c(1,1))
summary(paruelo_gam1)
##
## Family: Beta regression(4.304)
## Link function: logit
##
## Formula:
## c3nz ~ s(long, lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1021 0.1081 -10.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(long,lat) 4.848 6.815 59.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.455 Deviance explained = 56.3%
## -REML = -45.112 Scale est. = 1 n = 73
Says that there is a certain degree of wiggliness
Conclusions:
tidy(paruelo_gam1)
summary(paruelo_gam2)
##
## Family: Beta regression(4.366)
## Link function: logit
##
## Formula:
## c3nz ~ te(long, lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1120 0.1077 -10.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## te(long,lat) 3 3 62.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.466 Deviance explained = 55.4%
## -REML = -52.794 Scale est. = 1 n = 73
Conclusions:
tidy(paruelo_gam2)
summary(paruelo_gam3)
##
## Family: Beta regression(5.099)
## Link function: logit
##
## Formula:
## c3nz ~ ti(long) + ti(lat) + ti(long, lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2330 0.1151 -10.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## ti(long) 1.000 1.000 0.378 0.53843
## ti(lat) 1.000 1.000 73.604 < 2e-16 ***
## ti(long,lat) 3.523 3.875 15.070 0.00298 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.535 Deviance explained = 63.6%
## -REML = -52.2 Scale est. = 1 n = 73
Clear evidence of an interaction between long and lat affecting the values.
Conclusions:
tidy(paruelo_gam3)
paruelo_list <- with(paruelo,
list(lat = modelr::seq_range(lat, n=100),
long = modelr::seq_range(long, n=100)))
newdata <-
emmeans(paruelo_gam3, ~long+lat, at = paruelo_list, type='response') %>%
as.data.frame %>%
rename(c3 = response, lwr = lower.CL, upr = upper.CL)
newdata %>% head
newdata %>%
ggplot(aes(y = lat, x = long)) +
geom_tile(aes(fill = c3)) +
geom_contour(aes(z = c3)) +
scale_fill_gradientn(colors = heat.colors(10)) +
geom_point(data = paruelo, aes(fill = c3), shape = 21, size = 5) +
coord_equal()
newdata_sf <-
newdata %>%
dplyr::select(long, lat, c3) %>%
rasterFromXYZ() %>%
mask(canada) %>%
st_as_stars() %>%
st_set_crs(st_crs(canada))
## OR
#newdata.sf <- newdata %>%
# st_as_sf(coords=c("long", "lat"), crs=st_crs(canada)) %>%
# st_rasterize()
ggplot() +
geom_sf(data=canada) +
geom_stars(data=newdata_sf) +
scale_color_viridis_c() +
coord_sf(expand = FALSE) +
geom_sf(data = paruelo_sf, aes(col = c3, size = c3), alpha = 0.9)
ggplot() +
geom_sf(data = canada) +
geom_stars(data = newdata_sf) +
scale_fill_viridis_c() +
geom_sf(data=paruelo_sf, aes(fill = c3), shape = 21, size = 4) +
annotation_scale(location = "bl", width_hint = 0.25) +
annotation_north_arrow(location = "bl", which_north = "true",
pad_x = unit(0.25, "in"), pad_y = unit(0.2, "in"),
style = north_arrow_fancy_orienteering) +
coord_sf(expand=FALSE, ylim = c(20, 60)) +
theme(axis.title=element_blank(),
legend.position=c(0.99, 0), legend.justification=c(1, 0))